# Setup ----
# tidyverse: data wrangling + ggplot2; reticulate: run Python from R.
library(tidyverse)
library(reticulate)
# Replace this with your conda environment containing sklearn, pandas, & numpy
use_condaenv("py3.8", required = TRUE)
# Load the raw HR dataset; readr's printout below shows 401 employees x 35 cols
hr_data_tbl <- read_csv("data/HRDataset_v13.csv")
hr_data_tbl
## # A tibble: 401 x 35
## Employee_Name EmpID MarriedID MaritalStatusID GenderID EmpStatusID DeptID
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brown, Mia 1.10e9 1 1 0 1 1
## 2 LaRotonda, W… 1.11e9 0 2 1 1 1
## 3 Steans, Tyro… 1.30e9 0 0 1 1 1
## 4 Howard, Este… 1.21e9 1 1 0 1 1
## 5 Singh, Nan 1.31e9 0 0 0 1 1
## 6 Smith, Leigh… 7.11e8 1 1 0 5 1
## 7 Bunbury, Jes… 1.50e9 1 1 0 5 6
## 8 Carter, Mich… 1.40e9 0 0 0 1 6
## 9 Dietrich, Je… 1.41e9 0 0 0 1 6
## 10 Digitale, Al… 1.31e9 1 1 1 1 6
## # … with 391 more rows, and 28 more variables: PerfScoreID <dbl>,
## # FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## # Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## # MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## # DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## # EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## # ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## # EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## # LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>
# Inspect column types and sample values; note the date fields (DOB,
# DateofHire, DateofTermination, LastPerformanceReview_Date) parse as <chr>.
hr_data_tbl %>% glimpse()
## Rows: 401
## Columns: 35
## $ Employee_Name <chr> "Brown, Mia", "LaRotonda, William", "Stean…
## $ EmpID <dbl> 1103024456, 1106026572, 1302053333, 121105…
## $ MarriedID <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
## $ MaritalStatusID <dbl> 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 4, 1, 0, …
## $ GenderID <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, …
## $ EmpStatusID <dbl> 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 4, 1, 5, …
## $ DeptID <dbl> 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ PerfScoreID <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, …
## $ FromDiversityJobFairID <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ PayRate <dbl> 28.50, 23.00, 29.00, 21.50, 16.56, 20.50, …
## $ Termd <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, …
## $ PositionID <dbl> 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Position <chr> "Accountant I", "Accountant I", "Accountan…
## $ State <chr> "MA", "MA", "MA", "MA", "MA", "MA", "VA", …
## $ Zip <chr> "01450", "01460", "02703", "02170", "02330…
## $ DOB <chr> "11/24/87", "04/26/84", "09/01/86", "09/16…
## $ Sex <chr> "F", "M", "M", "F", "F", "F", "F", "F", "F…
## $ MaritalDesc <chr> "Married", "Divorced", "Single", "Married"…
## $ CitizenDesc <chr> "US Citizen", "US Citizen", "US Citizen", …
## $ HispanicLatino <chr> "No", "No", "No", "No", "No", "No", "No", …
## $ RaceDesc <chr> "Black or African American", "Black or Afr…
## $ DateofHire <chr> "10/27/2008", "1/6/2014", "9/29/2014", "2/…
## $ DateofTermination <chr> NA, NA, NA, "04/15/15", NA, "09/25/13", "0…
## $ TermReason <chr> "N/A - still employed", "N/A - still emplo…
## $ EmploymentStatus <chr> "Active", "Active", "Active", "Terminated …
## $ Department <chr> "Admin Offices", "Admin Offices", "Admin O…
## $ ManagerName <chr> "Brandon R. LeBlanc", "Brandon R. LeBlanc"…
## $ ManagerID <dbl> 1, 1, 1, 1, 1, 1, 17, 17, 17, 17, 17, 17, …
## $ RecruitmentSource <chr> "Diversity Job Fair", "Website Banner Ads"…
## $ PerformanceScore <chr> "Fully Meets", "Fully Meets", "Fully Meets…
## $ EngagementSurvey <dbl> 2.04, 5.00, 3.90, 3.24, 5.00, 3.80, 3.14, …
## $ EmpSatisfaction <dbl> 2, 4, 5, 3, 3, 4, 5, 5, 1, 5, 5, 4, 3, 5, …
## $ SpecialProjectsCount <dbl> 6, 4, 5, 4, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ LastPerformanceReview_Date <chr> "1/15/2019", "1/17/2019", "1/18/2019", NA,…
## $ DaysLateLast30 <dbl> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 0, NA, 0,…
# Visualize the share of missing values per column before choosing features
library(DataExplorer)
hr_data_tbl %>% plot_missing()
# Select the features used for clustering; the extra feature groups are left
# commented out so they can be toggled on for experimentation. drop_na()
# removes incomplete rows (401 -> 310).
hr_subset_tbl <- hr_data_tbl %>%
select(
# Employee
Employee_Name, GenderID, MaritalStatusID,
# Pay
PayRate,
# Department
Department # ManagerName,
# # Performance
# PerformanceScore, SpecialProjectsCount,
# # Surveys
# EngagementSurvey, EmpSatisfaction
) %>%
drop_na()
hr_subset_tbl %>% glimpse()
## Rows: 310
## Columns: 5
## $ Employee_Name <chr> "Brown, Mia", "LaRotonda, William", "Steans, Tyrone",…
## $ GenderID <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,…
## $ MaritalStatusID <dbl> 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 4, 1, 0, 3, 0, 0, 1,…
## $ PayRate <dbl> 28.50, 23.00, 29.00, 21.50, 16.56, 20.50, 55.00, 55.0…
## $ Department <chr> "Admin Offices", "Admin Offices", "Admin Offices", "A…
# Confirm no missingness remains after drop_na()
hr_subset_tbl %>% plot_missing()
library(recipes)
# Preprocessing recipe (outcome-free formula ~ .):
# - remove the identifier column (not a feature)
# - recode the integer-coded IDs as factors so step_dummy() one-hot encodes them
# - log-transform the right-skewed PayRate
# - dummy-encode every remaining nominal column (incl. Department)
# prep() estimates/trains all steps on hr_subset_tbl.
rec_obj <- recipe(~ ., hr_subset_tbl) %>%
step_rm(Employee_Name) %>%
step_mutate_at(GenderID, MaritalStatusID, fn = as.factor) %>%
step_log(PayRate) %>%
step_dummy(all_nominal()) %>%
prep()
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## predictor 5
##
## Training data contained 310 data points and no missing data.
##
## Operations:
##
## Variables removed Employee_Name [trained]
## Variable mutation for GenderID, MaritalStatusID [trained]
## Log transformation on PayRate [trained]
## Dummy variables from GenderID, MaritalStatusID, Department [trained]
# Extract the preprocessed training data from the trained recipe.
# NOTE(review): juice() is superseded by bake(rec_obj, new_data = NULL) in
# newer recipes releases — confirm installed version before switching.
hr_subset_processed_tbl <- juice(rec_obj)
hr_subset_processed_tbl %>% glimpse()
## Rows: 310
## Columns: 11
## $ PayRate <dbl> 3.349904, 3.135494, 3.367296, 3.06805…
## $ GenderID_X1 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1…
## $ MaritalStatusID_X1 <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1…
## $ MaritalStatusID_X2 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MaritalStatusID_X3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MaritalStatusID_X4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Department_Executive.Office <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_IT.IS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Production <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Sales <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ Department_Software.Engineering <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Prep for Python
# reticulate exposes these to Python as r.X (numpy array) and r.employee_names
X <- as.matrix(hr_subset_processed_tbl)
employee_names <- hr_subset_tbl$Employee_Name
# Data Manipulation
import pandas as pd
import numpy as np
# Access the R matrix `X` from Python via reticulate's `r` object
# (arrives as a 310 x 11 numpy array of the preprocessed features)
r.X
## array([[3.34990409, 0. , 1. , ..., 0. , 0. ,
## 0. ],
## [3.13549422, 1. , 0. , ..., 0. , 0. ,
## 0. ],
## [3.36729583, 1. , 0. , ..., 0. , 0. ,
## 0. ],
## ...,
## [3.98527347, 1. , 1. , ..., 0. , 0. ,
## 0. ],
## [3.97029191, 1. , 0. , ..., 0. , 0. ,
## 0. ]])
# The R character vector of names becomes a Python list; wrap it as a Series
pd.Series(r.employee_names)
## 0 Brown, Mia
## 1 LaRotonda, William
## 2 Steans, Tyrone
## 3 Howard, Estelle
## 4 Singh, Nan
## ...
## 305 Daniele, Ann
## 306 Lajiri, Jyoti
## 307 Semizoglou, Jeremiah
## 308 South, Joe
## 309 Warfield, Sarah
## Length: 310, dtype: object
# Machine Learning
from sklearn.cluster import AffinityPropagation
# Affinity Propagation picks the number of clusters itself by selecting
# exemplar rows; defaults (euclidean affinity, damping=0.5) are used here.
# NOTE(review): newer sklearn versions add a random_state parameter to
# AffinityPropagation — consider pinning it for reproducibility; confirm version.
af = AffinityPropagation().fit(r.X)
af
## AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
## damping=0.5, max_iter=200, preference=None, verbose=False)
# Row indices of the chosen exemplars — 24 clusters were found
af.cluster_centers_indices_
## array([ 7, 10, 12, 18, 35, 40, 45, 48, 78, 83, 94, 97, 114,
## 137, 179, 188, 190, 192, 224, 235, 286, 293, 297, 300])
# Cluster label per employee (0..23), pulled back into R later as
# py$cluster_assignments_af
cluster_assignments_af = af.labels_
cluster_assignments_af
## array([23, 17, 19, 23, 13, 23, 3, 0, 0, 2, 1, 0, 2, 1, 1, 1, 1,
## 3, 3, 1, 2, 0, 1, 0, 2, 1, 1, 2, 0, 0, 3, 1, 0, 7,
## 20, 4, 4, 4, 5, 5, 5, 5, 4, 5, 7, 6, 4, 7, 7, 4, 6,
## 4, 5, 18, 3, 20, 20, 6, 4, 20, 20, 5, 5, 7, 6, 4, 7, 5,
## 20, 7, 7, 4, 4, 23, 8, 19, 19, 18, 8, 11, 11, 9, 19, 9, 9,
## 8, 9, 9, 18, 16, 13, 11, 11, 13, 10, 10, 12, 11, 18, 11, 10, 13,
## 11, 18, 10, 11, 10, 10, 17, 13, 11, 10, 14, 11, 12, 10, 15, 18, 10,
## 10, 16, 13, 16, 11, 13, 11, 10, 13, 16, 14, 15, 18, 12, 10, 18, 13,
## 10, 13, 15, 11, 10, 10, 15, 10, 17, 10, 13, 13, 16, 16, 10, 16, 11,
## 18, 15, 14, 16, 13, 11, 16, 13, 11, 10, 12, 15, 18, 16, 18, 13, 16,
## 16, 13, 11, 10, 10, 18, 15, 13, 11, 14, 16, 10, 16, 10, 15, 13, 10,
## 13, 15, 15, 16, 15, 17, 16, 17, 13, 13, 16, 10, 10, 11, 16, 10, 13,
## 16, 17, 16, 11, 13, 15, 16, 16, 11, 10, 11, 13, 10, 13, 16, 19, 10,
## 13, 10, 19, 18, 19, 15, 10, 9, 18, 10, 11, 18, 19, 10, 19, 11, 10,
## 15, 11, 15, 18, 18, 10, 18, 18, 18, 16, 9, 10, 14, 19, 18, 11, 19,
## 11, 11, 10, 15, 18, 9, 16, 9, 19, 14, 11, 18, 11, 19, 18, 14, 9,
## 18, 19, 18, 16, 18, 14, 10, 10, 12, 0, 0, 1, 5, 20, 20, 2, 2,
## 21, 21, 21, 21, 21, 22, 21, 22, 22, 22, 23, 23, 4, 5, 20, 5, 7,
## 4, 4, 20, 5])
from sklearn.cluster import DBSCAN
# Density-based clustering with default eps=0.5; a label of -1 marks a
# "noise" point that belongs to no cluster.
db = DBSCAN(min_samples=5).fit(r.X)
db
## DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
## metric_params=None, min_samples=5, n_jobs=None, p=None)
# Cluster label per employee (-1 = noise), pulled back into R later as
# py$cluster_assignments_db
cluster_assignments_db = db.labels_
cluster_assignments_db
## array([ 0, -1, -1, 0, -1, 0, 1, 2, 2, 3, 4, -1, 3, 4, -1, 4, 4,
## 1, 1, 4, 3, 2, 4, 2, 3, 4, 4, 3, 2, 2, 1, -1, 2, 5,
## 6, 7, 7, 7, 8, 8, 8, 8, 7, 8, 5, -1, 7, 5, 5, 7, -1,
## 7, 8, -1, 1, 6, 6, -1, 7, 6, 6, 8, 8, 5, -1, 7, 5, -1,
## 6, 5, 5, 7, 7, -1, -1, -1, -1, -1, -1, 9, 9, 10, -1, 10, 10,
## -1, 10, 10, 11, 12, 11, 9, 9, 11, 10, 10, -1, 9, 11, 9, 10, 11,
## 9, 11, 10, 9, 10, 10, 13, 11, 9, 10, 14, 9, -1, 10, 15, 11, 10,
## 10, 12, 11, 12, 9, 11, 9, 10, 11, 12, 14, 15, 11, -1, 10, 11, 11,
## 10, 11, 15, 9, 10, 10, 15, 10, 13, 10, 11, 11, 12, 12, 10, 12, 9,
## 11, 15, 14, 12, 11, 9, 12, 11, 9, 10, -1, 15, 11, 12, 11, 11, 12,
## 12, 11, 9, 10, 10, 11, 15, 11, 9, 14, 12, 10, 12, 10, 15, 11, 10,
## 11, 15, 15, 12, 15, 13, 12, 13, 11, 11, 12, 10, 10, 9, 12, 10, 11,
## 12, 13, 12, 9, 11, 15, 12, 12, 9, 10, 9, 11, 10, 11, 12, 12, 10,
## 11, 10, 12, 11, 12, 15, 10, 10, 11, 10, 9, 11, 12, 10, 12, 9, 10,
## 15, 9, 15, 11, 11, 10, 11, 11, 11, 12, 10, 10, 14, 12, 11, 9, 12,
## 9, 9, 10, 15, 11, 10, 12, 10, 12, 14, 9, 11, 9, 12, 11, 14, 10,
## -1, -1, 11, 12, 11, 14, 10, 10, -1, 2, 2, -1, 8, 6, 6, -1, -1,
## -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 7, 8, 6, 8, 5,
## 7, 7, 6, -1])
# Needed to create a reduced representation of the original data in 2-D space.
from sklearn.manifold import TSNE
# Project the 11-D feature matrix down to 2-D for plotting; random_state is
# fixed so the embedding is reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=123).fit_transform(r.X)
pd.DataFrame(X_embedded)
## 0 1
## 0 10.122051 11.739521
## 1 -7.448895 -7.267364
## 2 -7.535513 -1.628443
## 3 10.195205 11.912298
## 4 -5.360659 -20.355145
## .. ... ...
## 305 13.353748 6.259142
## 306 11.660574 3.584381
## 307 11.434317 3.715972
## 308 14.006830 2.334109
## 309 15.658013 5.544502
##
## [310 rows x 2 columns]
# Back in R: access Python objects through reticulate's `py` object.
# Affinity Propagation
py$cluster_assignments_af
## [1] 23 17 19 23 13 23 3 0 0 2 1 0 2 1 1 1 1 3 3 1 2 0 1 0 2
## [26] 1 1 2 0 0 3 1 0 7 20 4 4 4 5 5 5 5 4 5 7 6 4 7 7 4
## [51] 6 4 5 18 3 20 20 6 4 20 20 5 5 7 6 4 7 5 20 7 7 4 4 23 8
## [76] 19 19 18 8 11 11 9 19 9 9 8 9 9 18 16 13 11 11 13 10 10 12 11 18 11
## [101] 10 13 11 18 10 11 10 10 17 13 11 10 14 11 12 10 15 18 10 10 16 13 16 11 13
## [126] 11 10 13 16 14 15 18 12 10 18 13 10 13 15 11 10 10 15 10 17 10 13 13 16 16
## [151] 10 16 11 18 15 14 16 13 11 16 13 11 10 12 15 18 16 18 13 16 16 13 11 10 10
## [176] 18 15 13 11 14 16 10 16 10 15 13 10 13 15 15 16 15 17 16 17 13 13 16 10 10
## [201] 11 16 10 13 16 17 16 11 13 15 16 16 11 10 11 13 10 13 16 19 10 13 10 19 18
## [226] 19 15 10 9 18 10 11 18 19 10 19 11 10 15 11 15 18 18 10 18 18 18 16 9 10
## [251] 14 19 18 11 19 11 11 10 15 18 9 16 9 19 14 11 18 11 19 18 14 9 18 19 18
## [276] 16 18 14 10 10 12 0 0 1 5 20 20 2 2 21 21 21 21 21 22 21 22 22 22 23
## [301] 23 4 5 20 5 7 4 4 20 5
# DBSCAN
py$cluster_assignments_db
## [1] 0 -1 -1 0 -1 0 1 2 2 3 4 -1 3 4 -1 4 4 1 1 4 3 2 4 2 3
## [26] 4 4 3 2 2 1 -1 2 5 6 7 7 7 8 8 8 8 7 8 5 -1 7 5 5 7
## [51] -1 7 8 -1 1 6 6 -1 7 6 6 8 8 5 -1 7 5 -1 6 5 5 7 7 -1 -1
## [76] -1 -1 -1 -1 9 9 10 -1 10 10 -1 10 10 11 12 11 9 9 11 10 10 -1 9 11 9
## [101] 10 11 9 11 10 9 10 10 13 11 9 10 14 9 -1 10 15 11 10 10 12 11 12 9 11
## [126] 9 10 11 12 14 15 11 -1 10 11 11 10 11 15 9 10 10 15 10 13 10 11 11 12 12
## [151] 10 12 9 11 15 14 12 11 9 12 11 9 10 -1 15 11 12 11 11 12 12 11 9 10 10
## [176] 11 15 11 9 14 12 10 12 10 15 11 10 11 15 15 12 15 13 12 13 11 11 12 10 10
## [201] 9 12 10 11 12 13 12 9 11 15 12 12 9 10 9 11 10 11 12 12 10 11 10 12 11
## [226] 12 15 10 10 11 10 9 11 12 10 12 9 10 15 9 15 11 11 10 11 11 11 12 10 10
## [251] 14 12 11 9 12 9 9 10 15 11 10 12 10 12 14 9 11 9 12 11 14 10 -1 -1 11
## [276] 12 11 14 10 10 -1 2 2 -1 8 6 6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0
## [301] 0 7 8 6 8 5 7 7 6 -1
# Convert the numpy embedding matrix to a tibble (columns auto-named V1/V2).
# NOTE(review): as_tibble() on an unnamed matrix warns in recent tibble
# versions; consider supplying .name_repair — confirm installed version.
X_embedded_tbl <- py$X_embedded %>% as_tibble()
X_embedded_tbl
## # A tibble: 310 x 2
## V1 V2
## <dbl> <dbl>
## 1 10.1 11.7
## 2 -7.45 -7.27
## 3 -7.54 -1.63
## 4 10.2 11.9
## 5 -5.36 -20.4
## 6 10.2 11.9
## 7 5.33 5.02
## 8 3.53 4.29
## 9 3.53 4.29
## 10 6.15 3.28
## # … with 300 more rows
library(plotly)
library(tidyquant)
# Combine the employee names, both cluster assignments, and the 2-D t-SNE
# coordinates, then join the full HR attributes back on for plotting/tooltips.
employee_clustering_tbl <- tibble(
  Employee_Name = employee_names,
  cluster_af = py$cluster_assignments_af,
  cluster_db = py$cluster_assignments_db
) %>%
  bind_cols(X_embedded_tbl) %>%
  # Explicit join key: silences the "Joining, by = ..." message and guards
  # against accidentally joining on extra shared columns if the schema changes.
  left_join(hr_data_tbl, by = "Employee_Name")
employee_clustering_tbl
## # A tibble: 310 x 39
## Employee_Name cluster_af cluster_db V1 V2 EmpID MarriedID
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brown, Mia 23 0 10.1 11.7 1.10e9 1
## 2 LaRotonda, W… 17 -1 -7.45 -7.27 1.11e9 0
## 3 Steans, Tyro… 19 -1 -7.54 -1.63 1.30e9 0
## 4 Howard, Este… 23 0 10.2 11.9 1.21e9 1
## 5 Singh, Nan 13 -1 -5.36 -20.4 1.31e9 0
## 6 Smith, Leigh… 23 0 10.2 11.9 7.11e8 1
## 7 Bunbury, Jes… 3 1 5.33 5.02 1.50e9 1
## 8 Carter, Mich… 0 2 3.53 4.29 1.40e9 0
## 9 Dietrich, Je… 0 2 3.53 4.29 1.41e9 0
## 10 Digitale, Al… 2 3 6.15 3.28 1.31e9 1
## # … with 300 more rows, and 32 more variables: MaritalStatusID <dbl>,
## # GenderID <dbl>, EmpStatusID <dbl>, DeptID <dbl>, PerfScoreID <dbl>,
## # FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## # Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## # MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## # DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## # EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## # ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## # EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## # LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>
# Termination rate and size per DBSCAN cluster, sorted worst-first.
termination_rate_tbl <- employee_clustering_tbl %>%
  select(cluster_db, Termd) %>%
  group_by(cluster_db) %>%
  summarise(
    # Termd is a 0/1 flag, so its mean is the proportion terminated
    term_rate = mean(Termd),
    term_count = n()
  ) %>%
  arrange(desc(term_rate))
termination_rate_tbl
## # A tibble: 17 x 3
## cluster_db term_rate term_count
## <dbl> <dbl> <int>
## 1 13 0.8 5
## 2 15 0.625 16
## 3 9 0.467 30
## 4 10 0.46 50
## 5 0 0.4 5
## 6 11 0.327 49
## 7 12 0.314 35
## 8 7 0.286 14
## 9 8 0.273 11
## 10 -1 0.268 41
## 11 1 0.2 5
## 12 4 0.125 8
## 13 14 0.125 8
## 14 2 0.111 9
## 15 6 0.1 10
## 16 3 0 5
## 17 5 0 9
# Interactive bar chart: termination count per cluster, bars shaded by rate.
# Clusters are reordered by termination count so the longest bar sits on top.
g <- termination_rate_tbl %>%
  mutate(
    cluster_db = fct_reorder(as_factor(cluster_db), term_count)
  ) %>%
  ggplot(aes(x = term_count, y = cluster_db)) +
  geom_col(aes(fill = term_rate)) +
  theme_tq() +
  labs(
    title = "Termination Rate by Employee Cluster",
    x = "Termination Count",
    y = "Cluster Assignment",
    fill = "Term. Rate"
  )
ggplotly(g)
# Scatter of the 2-D t-SNE embedding, colored by DBSCAN cluster, point size
# by cluster termination rate. The `text` aesthetic (built with str_glue) is
# ignored by ggplot2 itself but becomes the hover tooltip in ggplotly().
g <- employee_clustering_tbl %>%
left_join(termination_rate_tbl) %>%
mutate(description = str_glue("{Employee_Name}
Position = {Position}
MaritalDesc = {MaritalDesc}
Sex = {Sex}
Race = {RaceDesc}
EmpStatusID = {EmpStatusID}
PayRate = {PayRate}
Terminated = {Termd}
Term Reason = {TermReason}
Cluster Term Rate: {scales::percent(term_rate)}
Cluster Term Count: {term_count}
")
) %>%
select(Employee_Name:V2, description, Termd,
term_rate, term_count) %>%
ggplot(aes(V1, V2, color = factor(cluster_db))) +
geom_point(aes(text = description, size = term_rate), alpha = 0.5) +
scale_color_tq() +
theme_tq() +
# theme(legend.position = "none") +
labs(title = "Employee Cluster Assignments", color = "Cluster")
ggplotly(g)
# Bonus #1!!!
# source_python() runs a Python script and exposes its top-level functions
# as callable R functions (R matrix X is converted to numpy on the way in).
source_python("py/clustering.py")
source_python("py/tsne.py")
# Calls def cluster_dbscan(...)
cluster_dbscan(X)
## [1] 0 -1 -1 0 -1 0 1 2 2 3 4 -1 3 4 -1 4 4 1 1 4 3 2 4 2 3
## [26] 4 4 3 2 2 1 -1 2 5 6 7 7 7 8 8 8 8 7 8 5 -1 7 5 5 7
## [51] -1 7 8 -1 1 6 6 -1 7 6 6 8 8 5 -1 7 5 -1 6 5 5 7 7 -1 -1
## [76] -1 -1 -1 -1 9 9 10 -1 10 10 -1 10 10 11 12 11 9 9 11 10 10 -1 9 11 9
## [101] 10 11 9 11 10 9 10 10 13 11 9 10 14 9 -1 10 15 11 10 10 12 11 12 9 11
## [126] 9 10 11 12 14 15 11 -1 10 11 11 10 11 15 9 10 10 15 10 13 10 11 11 12 12
## [151] 10 12 9 11 15 14 12 11 9 12 11 9 10 -1 15 11 12 11 11 12 12 11 9 10 10
## [176] 11 15 11 9 14 12 10 12 10 15 11 10 11 15 15 12 15 13 12 13 11 11 12 10 10
## [201] 9 12 10 11 12 13 12 9 11 15 12 12 9 10 9 11 10 11 12 12 10 11 10 12 11
## [226] 12 15 10 10 11 10 9 11 12 10 12 9 10 15 9 15 11 11 10 11 11 11 12 10 10
## [251] 14 12 11 9 12 9 9 10 15 11 10 12 10 12 14 9 11 9 12 11 14 10 -1 -1 11
## [276] 12 11 14 10 10 -1 2 2 -1 8 6 6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0
## [301] 0 7 8 6 8 5 7 7 6 -1
# calls def tsne_embedding
tsne_embedding(X) %>% head()
## [,1] [,2]
## [1,] 10.122051 11.739521
## [2,] -7.448895 -7.267364
## [3,] -7.535513 -1.628443
## [4,] 10.195205 11.912298
## [5,] -5.360659 -20.355145
## [6,] 10.207287 11.939018
# Bonus #2!!!